Modeling US Census data

R25 Modelers and Story Tellers

Author

Drs. Hua Zhou and Roch Nianogo

Published

July 21, 2023

1 Introduction

In this tutorial, we learn to:

  • perform segregation analysis

  • regression analysis

Resources: Analyzing US Census Data Chapter 8 and R for Data Science.

2 Segregation and diversity

library(tidycensus)
library(tidyverse)
library(segregation)
library(tigris)
library(sf)

# Get California tract data by race/ethnicity
ca_acs_data <- get_acs(
  geography = "tract",
  variables = c(
    white = "B03002_003",
    black = "B03002_004",
    asian = "B03002_006",
    hispanic = "B03002_012"
  ), 
  state = "CA",
  geometry = TRUE,
  year = 2019
) 

# Use tidycensus to get urbanized areas by population with geometry, 
# then filter for those that have populations of 750,000 or more
us_urban_areas <- get_acs(
  geography = "urban area",
  variables = "B01001_001",
  geometry = TRUE,
  year = 2019,
  survey = "acs1"
) %>%
  filter(estimate >= 750000) %>%
  transmute(urban_name = str_remove(NAME, 
                                    fixed(", CA Urbanized Area (2010)")))

# Compute an inner spatial join between the California tracts and the 
# urbanized areas, returning tracts in the largest California urban 
# areas with the urban_name column appended
ca_urban_data <- ca_acs_data %>%
  st_join(us_urban_areas, left = FALSE) %>% # join = st_intersects by default
  select(-NAME) %>%
  st_drop_geometry()

2.1 White-Hispanic dissimilarity index

Dissimilarity index \[ D = \frac 12 \sum_{i=1}^N | \frac{a_i}{A} - \frac{b_i}{B}|, \] where \(a_i\) represents the population of group \(A\) in a given areal unit \(i\); A is the total population of that group in the study region (e.g. a metropolitan area); and \(b_i\) and \(B\) are the equivalent metrics for the second group. \(D=0\) indicates perfect integration and \(D=1\) means complete segregation.

Dissimilarity in the LA metro area:

ca_urban_data %>%
  filter(variable %in% c("white", "hispanic"),
         urban_name == "Los Angeles--Long Beach--Anaheim") %>%
  dissimilarity(
    group = "variable", # race groups
    unit = "GEOID",     # geographic regions
    weight = "estimate"
  )
   stat       est
1:    D 0.5999229

Compute the dissimilarity for each urban areas in California.

ca_urban_data %>%
  filter(variable %in% c("white", "hispanic")) %>%
  group_by(urban_name) %>%
  group_modify(~
    dissimilarity(.x,
      group = "variable",
      unit = "GEOID",
      weight = "estimate"
    )
  ) %>% 
  arrange(desc(est)) %>%
  print()
# A tibble: 6 × 3
# Groups:   urban_name [6]
  urban_name                       stat    est
  <chr>                            <chr> <dbl>
1 Los Angeles--Long Beach--Anaheim D     0.600
2 San Francisco--Oakland           D     0.514
3 San Jose                         D     0.494
4 San Diego                        D     0.490
5 Riverside--San Bernardino        D     0.408
6 Sacramento                       D     0.369

2.2 Multi-group segregation indices

For a data set \(T\),

  • Mutual information index is \[ M(T) = \sum_{u=1}^U \sum_{g=1}^G p_{ug} \log \frac{p_{ug}}{p_u p_g}. \]

  • Theil’s \(H\) is \[ H(T) = \frac{M(T)}{E(T)}, \] where \(E(T)\) is the entropy of \(T\), normalizing \(H\) to be between 0 and 1.

mutual_within(
  data = ca_urban_data,
  group = "variable",
  unit = "GEOID",
  weight = "estimate",
  within = "urban_name",
  wide = TRUE
)
                         urban_name         M          p         H ent_ratio
1: Los Angeles--Long Beach--Anaheim 0.3391033 0.50163709 0.2851662 0.9693226
2:        Riverside--San Bernardino 0.1497129 0.08678082 0.1408461 0.8664604
3:                       Sacramento 0.1658898 0.07369482 0.1426804 0.9477412
4:                        San Diego 0.2290891 0.12560720 0.2025728 0.9218445
5:           San Francisco--Oakland 0.2685992 0.13945223 0.2116127 1.0346590
6:                         San Jose 0.2147445 0.07282785 0.1829190 0.9569681

2.3 Local segregation analysis

Patterns of segregation across the most segregated urban area, Los Angeles:

la_local_seg <- ca_urban_data %>%
  filter(urban_name == "Los Angeles--Long Beach--Anaheim") %>%
  mutual_local(
    group = "variable",
    unit = "GEOID",
    weight = "estimate", 
    wide = TRUE
  ) %>%
  print()
            GEOID         ls            p
   1: 06037101110 0.28218462 0.0003362648
   2: 06037101122 0.77904804 0.0002689957
   3: 06037101210 0.10121933 0.0005088495
   4: 06037101220 0.11823338 0.0002917425
   5: 06037101300 0.65382196 0.0003093895
  ---                                    
2769: 06071012700 0.05912488 0.0003110085
2770: 06111007405 0.49522902 0.0004648128
2771: 06111007511 0.43635422 0.0001729084
2772: 06111008304 0.32087536 0.0004579321
2773: 06111008305 0.36332355 0.0002955471
la_tracts_seg <- tracts("CA", cb = TRUE, year = 2019) %>%
  inner_join(la_local_seg, by = "GEOID") 

la_tracts_seg %>%
  ggplot(aes(fill = ls)) + 
  geom_sf(color = NA) + 
  coord_sf(crs = 26946) + 
  scale_fill_viridis_c(option = "inferno") + 
  theme_void() + 
  labs(fill = "Local\nsegregation index")

3 Regression modeling (Los Angeles)

Median home value by Census tract in the Los Angeles metropolitan area:

library(tidycensus)
library(sf)

variables_to_get <- c(
  median_value = "B25077_001",
  median_rooms = "B25018_001",
  median_income = "DP03_0062",
  total_population = "B01003_001",
  median_age = "B01002_001",
  pct_college = "DP02_0068P",
  pct_foreign_born = "DP02_0094P",
  pct_white = "DP05_0077P",
  median_year_built = "B25037_001",
  percent_ooh = "DP04_0046P"
)

lametro_data <- get_acs(
  geography = "tract",
  variables = variables_to_get,
  state = "CA",
  county = c("Los Angeles", "Orange"),
  geometry = TRUE,
  output = "wide",
  year = 2020
) %>%
  select(-NAME) %>%
  # slice(-c(572, 1939)) %>%
  st_transform(4267) %>%
  print(width = Inf)
Simple feature collection with 3112 features and 21 fields (with 4 geometries empty)
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -118.9437 ymin: 32.80142 xmax: -117.4124 ymax: 34.82331
Geodetic CRS:  NAD27
# A tibble: 3,112 × 22
   GEOID       median_valueE median_valueM median_roomsE median_roomsM
 * <chr>               <dbl>         <dbl>         <dbl>         <dbl>
 1 06037101110        502200         51606           4.6           0.3
 2 06037101122        658800         21673           5.7           0.5
 3 06037101220        550800         39781           4.1           0.3
 4 06037101221        461600         75638           3.3           0.6
 5 06037101222        378400         93996           3.7           0.3
 6 06037101300        643200         24681           5.3           0.3
 7 06037101400        554300         55738           5.2           0.5
 8 06037102103        658200         50657           5             0.3
 9 06037102104        630200         44747           5.5           0.4
10 06037102105        545400         71790           4.6           0.4
   total_populationE total_populationM median_ageE median_ageM
 *             <dbl>             <dbl>       <dbl>       <dbl>
 1              3923               460        44.3         2.6
 2              4119               858        52.2         2.6
 3              3775               474        42.2         5  
 4              3787               651        40.5         6.1
 5              2717               442        39.3         3  
 6              3741               478        52.6         3.9
 7              3246               534        46.9         5.7
 8              1920               495        37.7         2.8
 9              4187               612        45.8         7.3
10              1797               336        42.2         4.7
   median_year_builtE median_year_builtM median_incomeE median_incomeM
 *              <dbl>              <dbl>          <dbl>          <dbl>
 1               1958                  3          74625          27799
 2               1964                  3          93125          15209
 3               1959                  5          55682           9264
 4               1975                  5          46274          13226
 5               1976                  9          30016           9714
 6               1958                  2          87066          27204
 7               1962                  5          66210          28061
 8               1958                  3          59005          15907
 9               1973                  2          98973          19597
10               1952                  6          82438          15646
   pct_collegeE pct_collegeM pct_foreign_bornE pct_foreign_bornM pct_whiteE
 *        <dbl>        <dbl>             <dbl>             <dbl>      <dbl>
 1         27.8          5.2              34.8               6.4       61  
 2         34.1          9.8              34.6               9.9       76.7
 3         26.2          7.2              44.4               8.7       43.9
 4         24            9.8              50.9               9.2       53.5
 5         17.1          7.4              55.7               9.6       54  
 6         33.2          7.1              45                 6.4       80.4
 7         30.9          6.4              34.6               7.5       69.5
 8         35.7          8.1              42.8               7.9       75  
 9         41            8.5              37.1               7.6       68.8
10         28.4          7.1              36.4               7.8       23  
   pct_whiteM percent_oohE percent_oohM
 *      <dbl>        <dbl>        <dbl>
 1        7.7         58.3          7.6
 2        8.1         74.9         10.4
 3        7.4         42.7          7.9
 4       10.4         20.4          9.2
 5       13.3         11.4          8.3
 6        6.9         86.8          5.6
 7        8.4         69.8          9.5
 8        9.7         57.9         14.6
 9        8.6         78.4          7.5
10        5.6         55.8         11.9
                                                                        geometry
 *                                                            <MULTIPOLYGON [°]>
 1 (((-118.2999 34.25961, -118.2999 34.26321, -118.297 34.26322, -118.2904 34.2…
 2 (((-118.3024 34.27381, -118.2988 34.27526, -118.2989 34.2767, -118.2965 34.2…
 3 (((-118.285 34.25227, -118.285 34.25589, -118.2841 34.25589, -118.2773 34.25…
 4 (((-118.2986 34.25598, -118.2923 34.25594, -118.2901 34.25593, -118.2864 34.…
 5 (((-118.2923 34.25232, -118.2877 34.2523, -118.2864 34.25331, -118.2864 34.2…
 6 (((-118.2773 34.25165, -118.276 34.25158, -118.276 34.25308, -118.2773 34.25…
 7 (((-118.3212 34.24958, -118.3158 34.2486, -118.3115 34.24887, -118.3062 34.2…
 8 (((-118.3644 34.2287, -118.3546 34.22852, -118.3498 34.2285, -118.3425 34.22…
 9 (((-118.3547 34.22003, -118.3529 34.22141, -118.3493 34.22136, -118.345 34.2…
10 (((-118.3522 34.21382, -118.3509 34.21514, -118.3476 34.21231, -118.3446 34.…
# … with 3,102 more rows

Visualization:

library(tidyverse)
library(patchwork)

mhv_map <- ggplot(lametro_data, aes(fill = median_valueE)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c(labels = scales::label_dollar()) + 
  theme_void() + 
  labs(fill = "Median home value ")

mhv_histogram <- ggplot(lametro_data, aes(x = median_valueE)) + 
  geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
                 bins = 100) + 
  theme_minimal() + 
  scale_x_continuous(labels = scales::label_number_si(accuracy = 0.1)) + 
  labs(x = "Median home value")
Warning: `label_number_si()` was deprecated in scales 1.2.0.
ℹ Please use the `scale_cut` argument of `label_number()` instead.
mhv_map + mhv_histogram
Warning: Removed 155 rows containing non-finite values (`stat_bin()`).

Log-transform:

library(tidyverse)
library(patchwork)

mhv_map_log <- ggplot(lametro_data, aes(fill = log(median_valueE))) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void() + 
  labs(fill = "Median home\nvalue (log)")

mhv_histogram_log <- ggplot(lametro_data, aes(x = log(median_valueE))) + 
  geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
                 bins = 100) + 
  theme_minimal() + 
  scale_x_continuous() + 
  labs(x = "Median home value (log)")

mhv_map_log + mhv_histogram_log
Warning: Removed 155 rows containing non-finite values (`stat_bin()`).

Feature engineering:

library(sf)
library(units)

lametro_data_for_model <- lametro_data %>%
  mutate(pop_density = as.numeric(set_units(total_populationE / st_area(.), "1/km2")),
         median_structure_age = 2018 - median_year_builtE) %>%
  select(!ends_with("M")) %>% 
  rename_with(.fn = ~str_remove(.x, "E$")) %>%
  drop_na() %>%
  slice(-c(572, 1939)) %>% # Catalina Island: Avalon
  print(width = Inf)
Simple feature collection with 2912 features and 13 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -118.9437 ymin: 33.38776 xmax: -117.4124 ymax: 34.82331
Geodetic CRS:  NAD27
# A tibble: 2,912 × 14
   GEOID       median_value median_rooms total_population median_age
   <chr>              <dbl>        <dbl>            <dbl>      <dbl>
 1 06037101110       502200          4.6             3923       44.3
 2 06037101122       658800          5.7             4119       52.2
 3 06037101220       550800          4.1             3775       42.2
 4 06037101221       461600          3.3             3787       40.5
 5 06037101222       378400          3.7             2717       39.3
 6 06037101300       643200          5.3             3741       52.6
 7 06037101400       554300          5.2             3246       46.9
 8 06037102103       658200          5               1920       37.7
 9 06037102104       630200          5.5             4187       45.8
10 06037102105       545400          4.6             1797       42.2
   median_year_built median_income pct_college pct_foreign_born pct_white
               <dbl>         <dbl>       <dbl>            <dbl>     <dbl>
 1              1958         74625        27.8             34.8      61  
 2              1964         93125        34.1             34.6      76.7
 3              1959         55682        26.2             44.4      43.9
 4              1975         46274        24               50.9      53.5
 5              1976         30016        17.1             55.7      54  
 6              1958         87066        33.2             45        80.4
 7              1962         66210        30.9             34.6      69.5
 8              1958         59005        35.7             42.8      75  
 9              1973         98973        41               37.1      68.8
10              1952         82438        28.4             36.4      23  
   percent_ooh
         <dbl>
 1        58.3
 2        74.9
 3        42.7
 4        20.4
 5        11.4
 6        86.8
 7        69.8
 8        57.9
 9        78.4
10        55.8
                                                                        geometry
                                                              <MULTIPOLYGON [°]>
 1 (((-118.2999 34.25961, -118.2999 34.26321, -118.297 34.26322, -118.2904 34.2…
 2 (((-118.3024 34.27381, -118.2988 34.27526, -118.2989 34.2767, -118.2965 34.2…
 3 (((-118.285 34.25227, -118.285 34.25589, -118.2841 34.25589, -118.2773 34.25…
 4 (((-118.2986 34.25598, -118.2923 34.25594, -118.2901 34.25593, -118.2864 34.…
 5 (((-118.2923 34.25232, -118.2877 34.2523, -118.2864 34.25331, -118.2864 34.2…
 6 (((-118.2773 34.25165, -118.276 34.25158, -118.276 34.25308, -118.2773 34.25…
 7 (((-118.3212 34.24958, -118.3158 34.2486, -118.3115 34.24887, -118.3062 34.2…
 8 (((-118.3644 34.2287, -118.3546 34.22852, -118.3498 34.2285, -118.3425 34.22…
 9 (((-118.3547 34.22003, -118.3529 34.22141, -118.3493 34.22136, -118.345 34.2…
10 (((-118.3522 34.21382, -118.3509 34.21514, -118.3476 34.21231, -118.3446 34.…
   pop_density median_structure_age
         <dbl>                <dbl>
 1       3424.                   60
 2       1548.                   54
 3       5393.                   59
 4      10518.                   43
 5       9460.                   42
 6       1456.                   60
 7        513.                   56
 8       1621.                   60
 9       2508.                   45
10       3678.                   66
# … with 2,902 more rows

Linear regression:

formula <- "log(median_value) ~ median_rooms + median_income + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"

model1 <- lm(formula = formula, data = lametro_data_for_model)

summary(model1)

Call:
lm(formula = formula, data = lametro_data_for_model)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.68883 -0.11751  0.01886  0.13418  0.86833 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           1.237e+01  5.007e-02 246.954  < 2e-16 ***
median_rooms          2.019e-02  9.279e-03   2.176 0.029613 *  
median_income         4.952e-06  2.918e-07  16.971  < 2e-16 ***
pct_college           9.325e-03  4.570e-04  20.406  < 2e-16 ***
pct_foreign_born      2.798e-03  4.880e-04   5.733 1.09e-08 ***
pct_white             1.050e-03  3.575e-04   2.938 0.003331 ** 
median_age            9.132e-03  1.027e-03   8.894  < 2e-16 ***
median_structure_age  9.255e-05  1.111e-05   8.333  < 2e-16 ***
percent_ooh          -5.492e-03  4.394e-04 -12.499  < 2e-16 ***
pop_density          -4.028e-06  1.925e-06  -2.092 0.036529 *  
total_population     -1.154e-05  3.205e-06  -3.600 0.000324 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2544 on 2901 degrees of freedom
Multiple R-squared:  0.6683,    Adjusted R-squared:  0.6671 
F-statistic: 584.4 on 10 and 2901 DF,  p-value: < 2.2e-16

Inspect collinearity:

library(corrr)

lametro_estimates <- lametro_data_for_model %>%
  select(-GEOID, -median_value, -median_year_built) %>%
  st_drop_geometry()

correlations <- correlate(lametro_estimates, method = "pearson")

network_plot(correlations)

Collinearity can be diagnosed further by calculating the variance inflation factor (VIF) for the model, which takes into account not just pairwise correlations but the extent to which predictors are collinear with all other predictors. A VIF value of 1 indicates no collinearity; VIF values above 5 suggest a level of collinearity that has a problematic influence on model interpretation.

library(car)

vif(model1)
        median_rooms        median_income          pct_college 
            5.142594             5.153376             4.194830 
    pct_foreign_born            pct_white           median_age 
            1.818328             3.795816             2.257856 
median_structure_age          percent_ooh          pop_density 
            1.035021             5.455227             1.972569 
    total_population 
            1.085428 

Exercise: Remove median_income:

formula2 <- "log(median_value) ~ median_rooms + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"

model2 <- lm(formula = formula2, data = lametro_data_for_model)

summary(model2)

Call:
lm(formula = formula2, data = lametro_data_for_model)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.59893 -0.11996  0.02226  0.14036  0.85423 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           1.229e+01  5.226e-02 235.099  < 2e-16 ***
median_rooms          8.255e-02  8.932e-03   9.241  < 2e-16 ***
pct_college           1.386e-02  3.886e-04  35.668  < 2e-16 ***
pct_foreign_born      2.763e-03  5.115e-04   5.402 7.14e-08 ***
pct_white             1.812e-03  3.718e-04   4.874 1.15e-06 ***
median_age            6.864e-03  1.067e-03   6.432 1.46e-10 ***
median_structure_age  9.175e-05  1.164e-05   7.881 4.56e-15 ***
percent_ooh          -3.967e-03  4.509e-04  -8.797  < 2e-16 ***
pop_density          -2.406e-06  2.016e-06  -1.194  0.23272    
total_population     -9.809e-06  3.358e-06  -2.921  0.00352 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2667 on 2902 degrees of freedom
Multiple R-squared:  0.6353,    Adjusted R-squared:  0.6342 
F-statistic: 561.8 on 9 and 2902 DF,  p-value: < 2.2e-16
vif(model2)
        median_rooms          pct_college     pct_foreign_born 
            4.336348             2.760267             1.818296 
           pct_white           median_age median_structure_age 
            3.735957             2.219631             1.035003 
         percent_ooh          pop_density     total_population 
            5.226919             1.967710             1.084332 

3.1 PCA

PC1 captures the gradient that represents these social differences, with which multiple demographic characteristics will be associated.

pca <- prcomp(
  formula = ~., 
  data = lametro_estimates, 
  scale. = TRUE, 
  center = TRUE
)

summary(pca)
Importance of components:
                         PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     2.136 1.2074 1.0424 0.92760 0.86859 0.70091 0.61376
Proportion of Variance 0.456 0.1458 0.1087 0.08604 0.07545 0.04913 0.03767
Cumulative Proportion  0.456 0.6018 0.7105 0.79654 0.87199 0.92112 0.95879
                           PC8     PC9    PC10
Standard deviation     0.42617 0.34845 0.33031
Proportion of Variance 0.01816 0.01214 0.01091
Cumulative Proportion  0.97695 0.98909 1.00000

PC loadings:

pca_tibble <- pca$rotation %>%
  as_tibble(rownames = "predictor") %>%
  print()
# A tibble: 10 × 11
   predictor       PC1     PC2     PC3      PC4     PC5     PC6      PC7     PC8
   <chr>         <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl>
 1 median_roo…  0.366  -0.400   0.148  -0.00847  0.0171 -0.355   1.04e-1 -0.279 
 2 total_popu… -0.0262 -0.413  -0.547   0.636    0.239   0.234   1.14e-1  0.0142
 3 median_age   0.341   0.118   0.165  -0.212    0.455   0.472   5.14e-1  0.226 
 4 median_inc…  0.414   0.0368 -0.0582  0.130    0.143  -0.446  -2.19e-1  0.200 
 5 pct_college  0.332   0.448  -0.185   0.148    0.235  -0.101  -3.20e-1  0.366 
 6 pct_foreig… -0.298  -0.108   0.130  -0.169    0.785  -0.0650 -3.66e-1 -0.308 
 7 pct_white    0.352   0.426  -0.202   0.0924  -0.0260  0.104   7.93e-2 -0.774 
 8 percent_ooh  0.371  -0.401   0.194  -0.0638   0.0490 -0.153   1.66e-1  0.0226
 9 pop_density -0.337   0.275  -0.125   0.0867   0.198  -0.588   6.28e-1  0.0272
10 median_str… -0.0604  0.156   0.710   0.680    0.0177  0.0616  2.69e-4 -0.0371
# … with 2 more variables: PC9 <dbl>, PC10 <dbl>
pca_tibble %>%
  select(predictor:PC5) %>%
  pivot_longer(PC1:PC5, names_to = "component", values_to = "value") %>%
  ggplot(aes(x = value, y = predictor)) + 
  geom_col(fill = "darkgreen", color = "darkgreen", alpha = 0.5) + 
  facet_wrap(~component, nrow = 1) + 
  labs(y = NULL, x = "Value") + 
  theme_minimal()

components <- predict(pca, lametro_estimates)

lametro_pca <- lametro_data_for_model %>%
  select(GEOID, median_value) %>%
  cbind(components) 

ggplot(lametro_pca, aes(fill = PC1)) +
  geom_sf(color = NA) +
  theme_void() +
  scale_fill_viridis_c()

The map, along with the bar chart, helps us understand how the multiple variables represent latent social processes at play in Los Angeles metro area. The brighter yellow areas, which have higher values for PC1, are located in communities like Beverley, Malibu, Rancho Palos Verdes. These communities are segregated, predominantly non-Hispanic white, and are among the wealthiest neighborhoods in the entire United States.

PC regression:

pca_formula <- paste0("log(median_value) ~ ", 
                      paste0('PC', 1:6, collapse = ' + '))

pca_model <- lm(formula = pca_formula, data = lametro_pca)

summary(pca_model)

Call:
lm(formula = pca_formula, data = lametro_pca)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.44114 -0.12239  0.01819  0.14387  0.83395 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept) 13.339410   0.004998 2668.984  < 2e-16 ***
PC1          0.121129   0.002341   51.748  < 2e-16 ***
PC2          0.165475   0.004140   39.968  < 2e-16 ***
PC3         -0.020855   0.004795   -4.349 1.41e-05 ***
PC4          0.059137   0.005389   10.974  < 2e-16 ***
PC5          0.116333   0.005755   20.214  < 2e-16 ***
PC6         -0.051495   0.007132   -7.220 6.59e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2697 on 2905 degrees of freedom
Multiple R-squared:  0.6266,    Adjusted R-squared:  0.6259 
F-statistic: 812.6 on 6 and 2905 DF,  p-value: < 2.2e-16

4 Spatial regression

4.1 Test spatial correlation

lametro_data_for_model$residuals <- residuals(model2)

ggplot(lametro_data_for_model, aes(x = residuals)) + 
  geom_histogram(bins = 100, alpha = 0.5, color = "navy",
                 fill = "navy") + 
  theme_minimal()

Moran’s I test for spatial correlation:

library(spdep)

wts <- lametro_data_for_model %>%
  poly2nb() %>%
  nb2listw()

moran.test(lametro_data_for_model$residuals, wts)

    Moran I test under randomisation

data:  lametro_data_for_model$residuals  
weights: wts    

Moran I statistic standard deviate = 35.823, p-value < 2.2e-16
alternative hypothesis: greater
sample estimates:
Moran I statistic       Expectation          Variance 
     0.3895827002     -0.0003435246      0.0001184783 
lametro_data_for_model$lagged_residuals <- lag.listw(wts, lametro_data_for_model$residuals)

ggplot(lametro_data_for_model, aes(x = residuals, y = lagged_residuals)) + 
  theme_minimal() + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm", color = "red")

4.2 Spatial lag model

Spatial lag model: \[ Y_i = \alpha + \rho Y_{\text{lag}-i} + \sum_k \beta_k X_{ki} + \epsilon_i, \] where \(w_{ij}\) represents the spatial weights, \(\rho\) measures the effect of the spatial lag in the outcome variable, and \(k\) is the number of predictors in the model.

library(spatialreg)

lag_model <- lagsarlm(
  formula = formula2, 
  data = lametro_data_for_model, 
  listw = wts
)

summary(lag_model, Nagelkerke = TRUE)

Call:lagsarlm(formula = formula2, data = lametro_data_for_model, listw = wts)

Residuals:
      Min        1Q    Median        3Q       Max 
-2.248426 -0.088715  0.014409  0.109125  0.860765 

Type: lag 
Coefficients: (asymptotic standard errors) 
                        Estimate  Std. Error z value  Pr(>|z|)
(Intercept)           4.3404e+00  2.0286e-01 21.3962 < 2.2e-16
median_rooms          9.1672e-02  7.1761e-03 12.7747 < 2.2e-16
pct_college           5.9231e-03  3.5537e-04 16.6672 < 2.2e-16
pct_foreign_born      1.4557e-03  4.0837e-04  3.5647 0.0003643
pct_white             1.7500e-04  3.0044e-04  0.5825 0.5602397
median_age            2.6586e-03  8.4933e-04  3.1302 0.0017469
median_structure_age  5.8519e-05  9.2648e-06  6.3163 2.678e-10
percent_ooh          -2.9268e-03  3.6095e-04 -8.1086 4.441e-16
pop_density          -5.9927e-06  1.6051e-06 -3.7336 0.0001888
total_population     -8.4036e-06  2.6684e-06 -3.1493 0.0016366

Rho: 0.62897, LR test value: 1086.5, p-value: < 2.22e-16
Asymptotic standard error: 0.015948
    z-value: 39.439, p-value: < 2.22e-16
Wald statistic: 1555.5, p-value: < 2.22e-16

Log likelihood: 265.2101 for lag model
ML residual variance (sigma squared): 0.044866, (sigma: 0.21182)
Nagelkerke pseudo-R-squared: 0.74891 
Number of observations: 2912 
Number of parameters estimated: 12 
AIC: -506.42, (AIC for lm: 578.1)
LM test for residual autocorrelation
test value: 0.13424, p-value: 0.71407

4.3 Spatial error models

Spatial error models: \[ Y_i = \alpha + \sum_k \beta_k X_{ki} + u_i, \] where \[ u_i = \lambda u_{\text{lag}-i} + \epsilon_i \] and \[ u_{\text{lag}-i} = \sum_j w_{ij} u_j. \]

error_model <- errorsarlm(
  formula = formula2, 
  data = lametro_data_for_model, 
  listw = wts
)

summary(error_model, Nagelkerke = TRUE)

Call:errorsarlm(formula = formula2, data = lametro_data_for_model, 
    listw = wts)

Residuals:
        Min          1Q      Median          3Q         Max 
-2.30256109 -0.08818706 -0.00055176  0.09152501  0.96055165 

Type: error 
Coefficients: (asymptotic standard errors) 
                        Estimate  Std. Error  z value  Pr(>|z|)
(Intercept)           1.2543e+01  5.2188e-02 240.3439 < 2.2e-16
median_rooms          1.4035e-01  7.6875e-03  18.2575 < 2.2e-16
pct_college           6.3327e-03  4.7800e-04  13.2484 < 2.2e-16
pct_foreign_born      8.1680e-04  6.3353e-04   1.2893  0.197298
pct_white             2.6800e-03  5.0010e-04   5.3589 8.372e-08
median_age            5.1477e-04  9.2398e-04   0.5571  0.577443
median_structure_age  2.7533e-05  1.0376e-05   2.6535  0.007965
percent_ooh          -3.2714e-03  3.8320e-04  -8.5370 < 2.2e-16
pop_density          -1.6847e-05  1.7724e-06  -9.5049 < 2.2e-16
total_population      3.4308e-07  2.6632e-06   0.1288  0.897498

Lambda: 0.7956, LR test value: 1247, p-value: < 2.22e-16
Asymptotic standard error: 0.013441
    z-value: 59.191, p-value: < 2.22e-16
Wald statistic: 3503.6, p-value: < 2.22e-16

Log likelihood: 345.4269 for error model
ML residual variance (sigma squared): 0.039582, (sigma: 0.19895)
Nagelkerke pseudo-R-squared: 0.76237 
Number of observations: 2912 
Number of parameters estimated: 12 
AIC: NA (not available for weighted model), (AIC for lm: 578.1)

Choosing between spatial lag and spatial error models:

moran.test(lag_model$residuals, wts)

    Moran I test under randomisation

data:  lag_model$residuals  
weights: wts    

Moran I statistic standard deviate = -0.19565, p-value = 0.5776
alternative hypothesis: greater
sample estimates:
Moran I statistic       Expectation          Variance 
    -0.0024709813     -0.0003435246      0.0001182411 
moran.test(error_model$residuals, wts)

    Moran I test under randomisation

data:  error_model$residuals  
weights: wts    

Moran I statistic standard deviate = -8.7047, p-value = 1
alternative hypothesis: greater
sample estimates:
Moran I statistic       Expectation          Variance 
    -0.0949433026     -0.0003435246      0.0001181065 

Lagrange multiplier tests:

lm.LMtests(
  model2, 
  wts, 
  test = c("LMerr", "LMlag", "RLMerr", "RLMlag")
)

    Lagrange multiplier diagnostics for spatial dependence

data:  
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts

LMerr = 1274.6, df = 1, p-value < 2.2e-16


    Lagrange multiplier diagnostics for spatial dependence

data:  
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts

LMlag = 1271, df = 1, p-value < 2.2e-16


    Lagrange multiplier diagnostics for spatial dependence

data:  
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts

RLMerr = 136.37, df = 1, p-value < 2.2e-16


    Lagrange multiplier diagnostics for spatial dependence

data:  
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts

RLMlag = 132.79, df = 1, p-value < 2.2e-16

5 Geographically weighted regression

Choose the number of nearest neighbors by CV:

library(GWmodel)
library(sf)

lametro_data_sp <- lametro_data_for_model %>%
  as_Spatial()

bw <- bw.gwr(
  formula = formula2, 
  data = lametro_data_sp, 
  kernel = "bisquare",
  adaptive = TRUE
)
Take a cup of tea and have a break, it will take a few minutes.
          -----A kind suggestion from GWmodel development group
Adaptive bandwidth: 1807 CV score: 160.4359 
Adaptive bandwidth: 1125 CV score: 148.0399 
Adaptive bandwidth: 702 CV score: 137.977 
Adaptive bandwidth: 442 CV score: 148.5875 
Adaptive bandwidth: 864 CV score: 142.1575 
Adaptive bandwidth: 603 CV score: 134.9265 
Adaptive bandwidth: 540 CV score: 134.1407 
Adaptive bandwidth: 503 CV score: 139.8375 
Adaptive bandwidth: 565 CV score: 134.046 
Adaptive bandwidth: 578 CV score: 134.271 
Adaptive bandwidth: 554 CV score: 133.8688 
Adaptive bandwidth: 550 CV score: 133.9889 
Adaptive bandwidth: 559 CV score: 133.9599 
Adaptive bandwidth: 553 CV score: 133.8772 
Adaptive bandwidth: 556 CV score: 133.9004 
Adaptive bandwidth: 553 CV score: 133.8772 
Adaptive bandwidth: 554 CV score: 133.8688 

Fitting and evaluating the GWR model

formula2 <- "log(median_value) ~ median_rooms + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"

gw_model <- gwr.basic(
  formula = formula2, 
  data = lametro_data_sp, 
  bw = bw,
  kernel = "bisquare",
  adaptive = TRUE
)
names(gw_model)
[1] "GW.arguments"  "GW.diagnostic" "lm"            "SDF"          
[5] "timings"       "this.call"     "Ftests"       
gw_model_results <- gw_model$SDF %>%
  st_as_sf() 

names(gw_model_results)
 [1] "Intercept"               "median_rooms"           
 [3] "pct_college"             "pct_foreign_born"       
 [5] "pct_white"               "median_age"             
 [7] "median_structure_age"    "percent_ooh"            
 [9] "pop_density"             "total_population"       
[11] "y"                       "yhat"                   
[13] "residual"                "CV_Score"               
[15] "Stud_residual"           "Intercept_SE"           
[17] "median_rooms_SE"         "pct_college_SE"         
[19] "pct_foreign_born_SE"     "pct_white_SE"           
[21] "median_age_SE"           "median_structure_age_SE"
[23] "percent_ooh_SE"          "pop_density_SE"         
[25] "total_population_SE"     "Intercept_TV"           
[27] "median_rooms_TV"         "pct_college_TV"         
[29] "pct_foreign_born_TV"     "pct_white_TV"           
[31] "median_age_TV"           "median_structure_age_TV"
[33] "percent_ooh_TV"          "pop_density_TV"         
[35] "total_population_TV"     "Local_R2"               
[37] "geometry"               
ggplot(gw_model_results, aes(fill = Local_R2)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void()

Percentage owner-occupied housing:

ggplot(gw_model_results, aes(fill = percent_ooh)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void() + 
  labs(fill = "Local β for \npercent_ooh")

Population density:

ggplot(gw_model_results, aes(fill = pop_density)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void() + 
  labs(fill = "Local β for \npopulation density")

6 Geodemographic clustering

6.1 Aspatial K-means

set.seed(1983)

lametro_kmeans <- lametro_pca %>%
  st_drop_geometry() %>%
  select(PC1:PC8) %>%
  kmeans(centers = 6)

table(lametro_kmeans$cluster)

  1   2   3   4   5   6 
738 144 380 455 612 583 
lametro_clusters <- lametro_pca %>%
  mutate(cluster = as.character(lametro_kmeans$cluster))

ggplot(lametro_clusters, aes(fill = cluster)) + 
  geom_sf(size = 0.1) + 
  scale_fill_brewer(palette = "Set1") + 
  theme_void() + 
  labs(fill = "Cluster ")

PCA plot. PC1, which is a gradient from affluent/older/white to lower-income/younger/nonwhite, and PC2, which represents areas with high population densities and educational attainment on the low end to lower-density, less educated areas on the high end.

library(plotly)

cluster_plot <- ggplot(lametro_clusters, 
                       aes(x = PC1, y = PC2, color = cluster)) + 
  geom_point() + 
  scale_color_brewer(palette = "Set1") + 
  theme_minimal()

ggplotly(cluster_plot) %>%
  layout(legend = list(orientation = "h", y = -0.15, 
                       x = 0.2, title = "Cluster"))

6.2 SKATER

In other applications, an analyst may want to generate meaningful clusters that are constrained to be neighboring or contiguous areas. An application of this workflow might be sales territory generation, where sales representatives will be assigned to communities in which they have local market knowledge but also want to minimize overall travel time.

SKATER (Spatial ’K’luster Analysis by Tree Edge Removal) algorithm:

library(spdep)
library(bigDM)

input_vars <- lametro_pca %>%
  select(PC1:PC8) %>%
  st_drop_geometry() %>%
  as.data.frame() 

skater_nbrs <- poly2nb(lametro_pca, queen = TRUE) # 2 disjoint connected subgraphs
skater_nbrs_mod <- connect_subgraphs(carto = lametro_pca, ID.area = "GEOID", nb = skater_nbrs)$nb
Searching for isolated areas:
  0 region(s) with no links

Searching for disjoint connected subgraphs:
  2 disjoint connected subgraphs
costs <- nbcosts(skater_nbrs_mod, input_vars)
skater_weights <- nb2listw(skater_nbrs_mod, costs, style = "B")
mst <- mstree(skater_weights)

regions <- skater(
  mst[, 1:2],
  input_vars, 
  ncuts = 8,
  crit = 10
)
lametro_clusters$region <- as.character(regions$group)

ggplot(lametro_clusters, aes(fill = region)) + 
  geom_sf(size = 0.1) + 
  scale_fill_brewer(palette = "Set1") + 
  theme_void()